biostats620_final_project

Packages Used

library(tidyverse)
library(excessmort)
library(kableExtra)
library(lubridate)
library(MASS)
library(pdftools)

Question 1: Population Sizes by Age Group and Sex

dat <- puerto_rico_counts

#marginal counts, we may want to collapse some of these (maybe into 10 yr bins?)
t1 <- dat %>%
  group_by(agegroup, sex) %>%
  summarize(count = mean(population), .groups = "drop") %>%
  pivot_wider(names_from = sex, values_from = count, values_fill = 0)

kable(t1)
agegroup female male
0-4 118886.60 124166.89
5-9 128338.20 134027.91
10-14 137254.48 142834.59
15-19 140546.33 145330.13
20-24 136900.96 135803.44
25-29 129586.13 122670.16
30-34 124736.05 114279.74
35-39 123391.99 110548.18
40-44 122339.68 108089.43
45-49 116511.76 102230.66
50-54 111973.38 96919.54
55-59 104562.71 89331.28
60-64 94151.71 80090.52
65-69 82889.24 69814.59
70-74 67546.00 55648.71
75-79 52283.22 41288.09
80-84 35543.43 26154.14
85-Inf 34932.77 21848.33
#stratified plots
dat %>% ggplot(aes(date, population)) + 
  geom_line(aes(color = agegroup)) +
  facet_wrap(~sex) +
  labs(title = "Population Size by Age Group and Gender",
       x = "Year",
       y = "Population",
       color = "Age Group") 

weekly_deaths <- dat %>%
  filter(year(date) < 2017) %>% 
  mutate(mmwr_week = epiweek(date), 
         mmwr_year = epiyear(date)) %>% 
  group_by(agegroup, sex, mmwr_week, mmwr_year) %>%
  summarise(total_deaths = sum(outcome, na.rm = TRUE), population = mean(population)) %>% 
  ungroup()
`summarise()` has grouped output by 'agegroup', 'sex', 'mmwr_week'. You can
override using the `.groups` argument.
#adding a plot to comment on why older folks are sticking around longer
weekly_deaths |> 
  filter(mmwr_year < 2017) |>
  group_by(mmwr_year) %>%
  summarize(mortality = mean(total_deaths/population)*1000, year = mmwr_year) %>% 
  ggplot(aes(x=year, y= mortality)) +
  geom_line(color = 'blue') +
  labs(title = "Yearly Mortality Rates (per 1,000): 1985-2016",
       x = "Year",
       y = "Rate")
`summarise()` has grouped output by 'mmwr_year'. You can override using the
`.groups` argument.

Considering the population size of Puerto Rico from 1985 to 2022, we see differing trends across age groups. For both males and females, the population size for older age groups is increasing, while younger age groups are decreasing. We see this particularly in the 0-4 age group for both sexes.

When examining the general trend in mortality rates over time, we see a decrease from 1985 to 2016. This could help to explain why we see increased population sizes for older age groups, potentially due to healthier lifestyles and improvements in medical care access and quality.

Question 2: Expected mortality Before 2017

t2 <- weekly_deaths %>% 
  # looking at trends/ estimate what a typical week looks like across years
  group_by(mmwr_week, agegroup, sex) %>% 
  summarise(mean_deaths = mean(total_deaths),
            sd = sd(total_deaths)) %>% 
  ungroup() 
`summarise()` has grouped output by 'mmwr_week', 'agegroup'. You can override
using the `.groups` argument.
kable(head(t2, 10))
mmwr_week agegroup sex mean_deaths sd
1 0-4 female 6.09375 2.8438885
1 0-4 male 7.15625 4.5302523
1 5-9 female 0.50000 0.8424235
1 5-9 male 0.59375 0.8370214
1 10-14 female 0.56250 0.8775883
1 10-14 male 0.90625 0.8175248
1 15-19 female 0.96875 1.2822454
1 15-19 male 3.53125 1.8136467
1 20-24 female 1.40625 1.2406911
1 20-24 male 6.40625 2.3808290
# some age groups have similar trends
t2 %>% 
  ggplot(aes(x = mmwr_week, y = mean_deaths, color = sex)) + 
  geom_line() + 
  geom_ribbon(aes(ymin = mean_deaths - sd,
                  ymax = mean_deaths + sd, 
                  fill = sex), alpha = 0.2, color = NA) +
  facet_wrap(~ agegroup, scales = "free_y") +
  labs(title = "Trend of Mean Mortality by Age and Gender", 
       x = "MMWR Week", 
       y = "Mean Deaths")

# creating a df with larger age groups
new_dat <- dat %>% 
  mutate(mmwr_week = epiweek(date), 
         mmwr_year = epiyear(date),
         agegroup_new = case_when(
           agegroup %in% c("0-4", "5-9", "10-14", "15-19", "20-24") ~ "0-24",
           agegroup %in% c("25-29", "30-34", "35-39", "40-44", "45-49",
                           "50-54", "55-59", "60-64") ~ "25-64",
           agegroup %in% c("65-69", "70-74", "75-79", "80-84") ~ "65-84",
           agegroup == "85-Inf" ~ "85+")) %>% 
  group_by(agegroup_new, sex, mmwr_week, mmwr_year) %>%
  summarise(total_deaths = sum(outcome, na.rm = TRUE),  population = mean(population)) %>% 
  ungroup()
`summarise()` has grouped output by 'agegroup_new', 'sex', 'mmwr_week'. You can
override using the `.groups` argument.
t3 <- new_dat %>% 
  # looking at trends/ estimate what a typical week looks like across years
  group_by(mmwr_week, agegroup_new, sex) %>% 
  summarise(mean_deaths = mean(total_deaths),
            sd = sd(total_deaths)) %>% 
  ungroup() 
`summarise()` has grouped output by 'mmwr_week', 'agegroup_new'. You can
override using the `.groups` argument.
kable(head(t3, 10))
mmwr_week agegroup_new sex mean_deaths sd
1 0-24 female 8.736842 4.195778
1 0-24 male 17.000000 7.128170
1 25-64 female 50.631579 7.290946
1 25-64 male 108.236842 18.397171
1 65-84 female 113.763158 20.674353
1 65-84 male 141.921053 26.324772
1 85+ female 80.078947 26.970837
1 85+ male 60.921053 21.364762
2 0-24 female 9.289474 5.598783
2 0-24 male 18.500000 9.146702
t3 %>% 
  ggplot(aes(x = mmwr_week, y = mean_deaths, color = sex)) + 
  geom_line() + 
  geom_ribbon(aes(ymin = mean_deaths - sd,
                  ymax = mean_deaths + sd,
                  fill = sex), alpha = 0.2, color = NA) +
  facet_wrap(~ agegroup_new, scales = "free_y") +
  labs(title = "Trend of Mean Mortality by Age and Gender", 
       x = "MMWR Week", 
       y = "Mean Deaths")

Linear Model:

# pre-2017 data
model_dataset <- new_dat %>% 
  filter(mmwr_year < 2017) %>% 
  mutate(rate = total_deaths/population, 
         age = as.factor(agegroup_new))

# linear model fit on data before 2017
linear <- lm(rate ~ population + mmwr_week + age + sex, data = model_dataset)
summary(linear)

Call:
lm(formula = rate ~ population + mmwr_week + age + sex, data = model_dataset)

Residuals:
       Min         1Q     Median         3Q        Max 
-1.630e-03 -2.093e-04 -4.560e-06  1.816e-04  1.846e-03 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.739e-03  3.805e-05   45.71   <2e-16 ***
population  -1.258e-08  2.525e-10  -49.83   <2e-16 ***
mmwr_week   -2.074e-06  2.001e-07  -10.36   <2e-16 ***
age25-64     1.975e-04  1.186e-05   16.65   <2e-16 ***
age65-84     1.262e-03  2.506e-05   50.35   <2e-16 ***
age85+       1.067e-03  3.112e-05   34.28   <2e-16 ***
sexmale      4.230e-04  6.321e-06   66.92   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.0003484 on 13353 degrees of freedom
Multiple R-squared:  0.917, Adjusted R-squared:  0.917 
F-statistic: 2.46e+04 on 6 and 13353 DF,  p-value: < 2.2e-16
# check if there are negative fitted values
perc_negative <- mean(linear$fitted.values < 0)
perc_negative
[1] 0.08787425

Table of Expected Mortality and SD of Expected Mortality

t4 <- model_dataset %>% 
  mutate(fitted = linear$fitted.values) %>% 
  group_by(mmwr_week, age, sex) %>% 
  summarise(exp_mortality = mean(fitted), 
            sd_exp_mortality = sd(fitted))
`summarise()` has grouped output by 'mmwr_week', 'age'. You can override using
the `.groups` argument.
kable(head(t4))
mmwr_week age sex exp_mortality sd_exp_mortality
1 0-24 female -0.0000486 0.0002048
1 0-24 male 0.0003213 0.0001978
1 25-64 female 0.0004649 0.0001194
1 25-64 male 0.0010539 0.0001066
1 65-84 female 0.0023168 0.0001581
1 65-84 male 0.0028658 0.0001134
## findingoverall mean and sd of mortality 
t4 %>% 
  ungroup() %>% 
  summarise(mean = mean(exp_mortality), 
            sd(sd_exp_mortality))
# A tibble: 1 × 2
     mean `sd(sd_exp_mortality)`
    <dbl>                  <dbl>
1 0.00149              0.0000491

Linear Model Diagnostics

resids <- residuals(linear)

# QQ plot
ggplot(data.frame(residuals = resids), aes(sample = residuals)) +
  stat_qq(size = 1.5, color = "darkblue", alpha = 0.6) +
  stat_qq_line(color = "orange", size = 1) +  
  labs(title = "Normal Q-Q Plot of Model Residuals",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# Residuals vs Fitted Plot
model_df <- data.frame(
  fitted = fitted(linear),
  residuals = residuals(linear)
)

ggplot(model_df, aes(x = fitted, y = residuals)) +
  geom_point(alpha = 0.4, color = "darkblue", size = 1.5) +
  geom_hline(yintercept = 0, color = "indianred") +
  geom_smooth(method = "loess", se = FALSE, color = "orange", linetype = "solid") +
  labs(title = "Residuals vs Fitted Values",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Fitting a linear model, we see issues such as violations of homoskedasticity (funnel shape) and normality (tails of qq plot). We also see negative predictions for a non-negative outcome (rate).

Log-Linear Model:

glm_dataset <- new_dat %>% filter(mmwr_year < 2017) 

log_linear <- glm(total_deaths ~ mmwr_week + agegroup_new + sex, 
                  offset = log(population), 
                  family = poisson, 
                  data = glm_dataset)

summary(log_linear)

Call:
glm(formula = total_deaths ~ mmwr_week + agegroup_new + sex, 
    family = poisson, data = glm_dataset, offset = log(population))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-6.9056  -1.4332  -0.1159   1.2604   8.9106  

Coefficients:
                    Estimate Std. Error  z value Pr(>|z|)    
(Intercept)       -9.498e+00  5.171e-03 -1836.76   <2e-16 ***
mmwr_week         -9.211e-04  6.973e-05   -13.21   <2e-16 ***
agegroup_new25-64  1.970e+00  5.056e-03   389.72   <2e-16 ***
agegroup_new65-84  3.239e+00  4.910e-03   659.68   <2e-16 ***
agegroup_new85+    3.326e+00  5.142e-03   646.95   <2e-16 ***
sexmale            4.929e-01  2.124e-03   232.06   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1128141  on 13359  degrees of freedom
Residual deviance:   49066  on 13354  degrees of freedom
AIC: 125630

Number of Fisher Scoring iterations: 4
#GOF: prob of observing this deviance value given the df
1-pchisq(log_linear$deviance, log_linear$df.residual)
[1] 0

We next fit a log-linear model with mmwr_week, age, and sex again as predictors. We again seek to predict weekly death rate, this time modeling total deaths and using an offset equal to population size for a given week, gender, age, and year. Univariate Wald tests show that each coefficient is significantly associated with our outcome of interest, and a comparison of null and residual deviances shows that our model fits significantly better than the intercept-only model (LRT: 1079075 on 5 degrees of freedom).

Goodness of Fit: Under the null, the residual deviance should be distributed as a \(X^2\) random variable with degrees of freedom equal to the model’s residual degrees of freedom, 13354. We observe a residual deviance value of 49066, which naturally will increase with the total sample size unless the model fits extremely well. The probability of observing such a residual deviance value given that the fit model holds is essentially zero, indicating a lack of model fit.

####Assessing for Overdispersion:

plot(log_linear, which = 3)

overdisp <- log_linear$deviance/log_linear$df.residual
overdisp
[1] 3.67425

There is an upward trend in the scale-location plot for predicted values vs standardized pearson residuals. In a well-fitting model, the ratio of residual deviance to the degrees of freedom should be approximately 1 since the poisson distribution assumes that the mean and variance are exactly equal. In this case, the ratio value is 3.67 indicating that overdispersion is present and that we may be underestimating the standard error of our data.

Negative Binomial Regression:

neg_bin <- glm.nb(total_deaths ~ mmwr_week + agegroup_new + sex + agegroup_new*sex + offset(log(population)), 
                  link = log,
                  data = glm_dataset)

summary(neg_bin)

Call:
glm.nb(formula = total_deaths ~ mmwr_week + agegroup_new + sex + 
    agegroup_new * sex + offset(log(population)), data = glm_dataset, 
    link = log, init.theta = 47.20250994)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.7721  -0.7268  -0.0438   0.6482   8.1872  

Coefficients:
                            Estimate Std. Error   z value Pr(>|z|)    
(Intercept)               -9.6721006  0.0095420 -1013.633  < 2e-16 ***
mmwr_week                 -0.0008334  0.0001157    -7.202 5.95e-13 ***
agegroup_new25-64          1.8978371  0.0103386   183.567  < 2e-16 ***
agegroup_new65-84          3.4817399  0.0100028   348.078  < 2e-16 ***
agegroup_new85+            3.6702479  0.0101586   361.294  < 2e-16 ***
sexmale                    0.7490751  0.0112316    66.693  < 2e-16 ***
agegroup_new25-64:sexmale  0.1322616  0.0130319    10.149  < 2e-16 ***
agegroup_new65-84:sexmale -0.3412163  0.0127178   -26.830  < 2e-16 ***
agegroup_new85+:sexmale   -0.5914260  0.0131053   -45.129  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(47.2025) family taken to be 1)

    Null deviance: 475292  on 13359  degrees of freedom
Residual deviance:  13764  on 13351  degrees of freedom
AIC: 101244

Number of Fisher Scoring iterations: 1

              Theta:  47.20 
          Std. Err.:  1.02 

 2 x log-likelihood:  -101223.91 
1-pchisq(neg_bin$deviance, neg_bin$df.residual)
[1] 0.006131741

Comparing this model with the log-linear model on AIC, we conclude that the negative binomial model fits better given its lower AIC value (125630 vs 101244). However, the probability of observing such a residual deviance value given that the fit model holds is still very low (p = 0.0061), indicating a lack of model fit despite improvements. A few reasons for this include that we are not accounting for autocorrelation in our data (since we’re looking at the population of Puerto Rico over time), seasonal effects on mortality rates, and the overall downward trend in mortality rates due to factors like improvements in healthcare.

Table of Expected Mortality and SD of Expected Mortality

t5 <- glm_dataset %>% 
  mutate(fitted = neg_bin$fitted.values/population) %>% 
  group_by(mmwr_week, agegroup_new, sex) %>% 
  summarise(exp_mortality = mean(fitted), 
            sd_exp_mortality = sd(fitted))
`summarise()` has grouped output by 'mmwr_week', 'agegroup_new'. You can
override using the `.groups` argument.
kable(head(t5))
mmwr_week agegroup_new sex exp_mortality sd_exp_mortality
1 0-24 female 0.0000630 0
1 0-24 male 0.0001332 0
1 25-64 female 0.0004201 0
1 25-64 male 0.0010141 0
1 65-84 female 0.0020474 0
1 65-84 male 0.0030784 0
t5 %>% 
  ungroup() %>% 
  summarise(mean = mean(exp_mortality), 
            sd(sd_exp_mortality))
# A tibble: 1 × 2
     mean `sd(sd_exp_mortality)`
    <dbl>                  <dbl>
1 0.00148               6.04e-19

Question 3: Periods with Excess Mortality Before 2018

Linear Model

# Looking only at data before 2018 
predict_pre <- new_dat %>% 
  filter(mmwr_year <= 2018) %>%
  arrange(mmwr_year, mmwr_week) %>% 
  mutate(rate = total_deaths/population, 
         age = as.factor(agegroup_new)) %>% 
  group_by(agegroup_new, sex) %>% 
  mutate(week_index = row_number()) %>% 
  ungroup()

# getting predictions
predict_pre$predicted_rate <- predict(linear, newdata = predict_pre)

# excess mortality (obs - predicted)
predict_pre <- predict_pre %>% 
  mutate(predicted_deaths = predicted_rate * population,
         excess_mortality = rate - predicted_rate, 
         excess_deaths = total_deaths - predicted_deaths)

#observed v fitted over time: looking at sex differences
predict_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
  geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.1) +
  labs(title = "Excess Mortality over Time with Linear Model: Before 2018",
       subtitle = "Differences by Sex",
       x = "Week", y = "Excess Mortality",
       color = 'Sex') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

predict_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
  geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.1) +
  labs(title = "Excess Mortality over Time with Linear Model: Before 2018",
       subtitle = 'Differences by Age Group',
       x = "Week", y = "Excess Mortality",
       color = 'Age Group') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Negative Binomial Model

# Looking only at data before 2018 
predictnb_pre <- new_dat %>% 
  filter(mmwr_year <= 2018) %>%
  arrange(mmwr_year, mmwr_week) %>% 
  mutate(rate = total_deaths/population, 
         age = as.factor(agegroup_new)) %>% 
  group_by(agegroup_new, sex) %>% 
  mutate(week_index = row_number()) %>% 
  ungroup()

# use nb model to predict rates for data before 2018
predictnb_pre$predicted_rate <- predict(neg_bin, type = 'response', newdata = predictnb_pre) / predictnb_pre$population

# excess mortality (obs - predicted)
predictnb_pre <- predictnb_pre %>% 
  mutate(true_rate = total_deaths/population,
         excess_mortality = true_rate - predicted_rate)

#observed v fitted over time: looking at sex differences
predictnb_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
  geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.1) +
  labs(title = "Excess Mortality over Time with Negtive Binomial Model: Before 2018",
       subtitle = "Differences by Sex",
       x = "Week", y = "Excess Mortality",
       color = 'Sex')+
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

predictnb_pre %>% ggplot(aes(x = week_index, y = excess_mortality)) +
  geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.1) +
  labs(title = "Excess Mortality over Time with Negative Binomial Model: Before 2018",
       subtitle = 'Differences by Age Group',
       x = "Week", y = "Excess Mortality",
       color = 'Age Group') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Based on the predictions from the linear and negative binomial models, there appears to be excess mortality in the year 1985, decreasing from a peak outside of the dataset. This is particularly seen in older age demographics. One potential cause of this could have been the 1985 Puerto Rico floods, which triggered the deadliest single landslide ever recorded in North America.

Recomputing Expected Counts

predictnb_pre %>% filter(week_index >= 65) %>% 
  ggplot(aes(x = week_index, y = excess_mortality)) +
  geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.1) +
  labs(title = "Excess Mortality over Time with Negative Binomial Model: Before 2018",
       subtitle = 'Removing up to Week 13 of 1986',
       x = "Week", y = "Excess Mortality",
       color = 'Age Group') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

#table of expected counts:
t6 <- predictnb_pre %>% filter(week_index >= 65) %>% 
  group_by(mmwr_week, agegroup_new, sex) %>% 
  summarise(exp_mortality = mean(predicted_rate), 
            sd_exp_mortality = sd(predicted_rate))
`summarise()` has grouped output by 'mmwr_week', 'agegroup_new'. You can
override using the `.groups` argument.
kable(head(t6))
mmwr_week agegroup_new sex exp_mortality sd_exp_mortality
1 0-24 female 0.0000630 0
1 0-24 male 0.0001332 0
1 25-64 female 0.0004201 0
1 25-64 male 0.0010141 0
1 65-84 female 0.0020474 0
1 65-84 male 0.0030784 0

Question 4: Predictions for 2017-2018

Linear Prediction

# 2017-2018 data used for prediction
predict_lm <- new_dat %>% 
  filter(mmwr_year >= 2017 & mmwr_year <= 2018) %>% 
  mutate(rate = total_deaths/population, 
         age = as.factor(agegroup_new))

# use linear model to predict data 2017 and after
predict_lm$predicted_rate <- predict(linear, newdata = predict_lm)

# excess mortality (obs - predicted)
predict_lm <- predict_lm %>% 
  mutate(predicted_deaths = predicted_rate * population,
         excess_mortality = rate - predicted_rate, 
         excess_deaths = total_deaths - predicted_deaths)

# observed vs fitted plots
ggplot(predict_lm, aes(x = rate, y = predicted_rate, color = agegroup_new)) +
  geom_point(alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", size = 0.8) +
  labs(title = "Predicted vs. Observed Mortality Rate",
       x = "Observed Rate", y = "Predicted Rate",
       color = 'Age Group') +
  theme_minimal()

# plot of excess mortality over weeks
plot_df <- predict_lm %>% 
  arrange(mmwr_year, mmwr_week) %>% 
  mutate(week_index = row_number()) %>% 
  dplyr::select(week_index, rate, predicted_rate, sex, agegroup_new) 
  # pivot_longer(cols = c("rate", "predicted_rate"), 
  #              names_to = "type", 
  #              values_to = "mortality_rate")

ggplot(plot_df, aes(x = week_index, y = rate)) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
  geom_point(aes(x = week_index, y = rate),  alpha = 0.6, color = "pink") +
  facet_grid(sex ~ agegroup_new) +
  labs(title = "Obs. vs. Fitted Mortality Rates", 
       y = "Obs. Mortality Rate",
       x = "Week Index") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

# comparing excess mortality by gender
predict_lm %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
  geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.2) +
  facet_wrap(~mmwr_year) +
  labs(title = "Excess Mortality over Time: Linear Model",
       subtitle = "Differences in Excess Mortality by Sex",
       x = "Week", y = "Excess Mortality",
       color = 'Sex') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

# comparing excess mortality by age
predict_lm %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
  geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.2) +
  facet_wrap(~mmwr_year) +
  labs(title = "Excess Mortality over Time: Linear Model",
       subtitle = "Differences in Excess Mortality by Age Group",
       x = "Week", y = "Excess Mortality",
       color = 'Age Group') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

In addition to the model violating the linear model assumptions, we can see that the model does not predict well and a better suited model should be used. From the Predicted vs Observed Rates plot, we can see that for the most part, the linear model is overestimating the actual mortality rate.

Looking at excess mortality, we can see that almost all throughout 2017-2018, excess mortality was negative for both males and female, again indicating that the linear model was overestimating mortality rates. However, it is obvious that Hurricane Maria made landfall around weeks 38/39 when excess mortality spiked for both males and females (above 0) showing how the linear model then underestimated the mortality rates given the unexpected natural disaster. There is a difference in excess mortality between males and females thoughtout most of 2017-2018 with male excess mortality being lower, perhaps indicating higher baseline mortality predictions for males.

When looking at age groups, we can tell that excess mortality was negative for the part during 2017-2018 with a spike when Hurricane Maria made landfall. The linear model was especially bad at predicting mortality rates for the 0-24 age groups and there was also no spike in ecess mortality for this age group when Hurricane Maria made Landfall. Same with 25-64 age groups but not as strong.

The hurricane seemed to affect older age groups with both males and females equally affected.

Negative Binomal Prediction:

# 2017-2018 data used for prediction
predict_nb <- new_dat %>% 
  filter(mmwr_year >= 2017 & mmwr_year <= 2018) %>% 
  mutate(age = as.factor(agegroup_new))

# use nb model to predict data 2017 and after
predict_nb$predicted_rate <- predict(neg_bin, type = 'response', newdata = predict_nb) / predict_nb$population

# excess mortality (obs - predicted)
predict_nb <- predict_nb %>% 
  mutate(true_rate = total_deaths/population,
         excess_mortality = true_rate - predicted_rate)

# comparing true vs expected predicted rate (excess mortality)
predict_nb <- predict_nb %>%
  mutate(mmwr_label = paste0(mmwr_year, "-W", stringr::str_pad(mmwr_week, 2, pad = "0")))

# observed vs fitted plots
ggplot(predict_nb, aes(x = true_rate, y = predicted_rate, color = agegroup_new)) +
  geom_point(alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, color = "red", size = 0.8) +
  labs(title = "Observed vs Predicted Mortality Rate: Negative Binomial Model",
       x = "Observed Rate", y = "Predicted Rate",
       color = 'Age Group') +
  theme_minimal()

Excess Mortality over Time

#observed v fitted over time: looking at sex differences
predict_nb %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
  geom_smooth(aes(color = sex, group = sex), method = "loess", se = FALSE, span = 0.2) +
  facet_wrap(~mmwr_year) +
  labs(title = "Excess Mortality over Time: Negative Binomial Model",
       subtitle = "Differences in Excess Mortality by Sex",
       x = "Week", y = "Excess Mortality",
       color = 'Sex') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

#observed v fitted over time: looking at age group differences
predict_nb %>% ggplot(aes(x = mmwr_week, y = excess_mortality)) +
  geom_smooth(aes(color = agegroup_new, group = agegroup_new), method = "loess", se = FALSE, span = 0.2) +
  facet_wrap(~mmwr_year) +
  labs(title = "Excess Mortality over Time: Negative Binomial Model",
       subtitle = "Differences in Excess Mortality by Age Group",
       x = "Week", y = "Excess Mortality",
       color = 'Age Group') +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Question 5: Comparison of NYT Data and excessmort Data

pdf_url <- "https://github.com/c2-d2/pr_mort_official/raw/master/data/Mortalidad-RegDem-2015-17-NYT-part1.pdf"

# View(extract_tables(pdf_url, pages = 1)[[1]])

# Download PDF to a temporary file
temp_file <- tempfile(fileext = ".pdf")
download.file(pdf_url, temp_file, mode = "wb")

# extract text from PDF
pdf_text_data <- pdf_text(temp_file)

# view first page of pdf file
# cat(pdf_text_data[1])

extract_table_from_page <- function(page_text) {
  lines <- strsplit(page_text, "\n")[[1]]
  lines <- trimws(lines)
  
  # Keep lines that start with day numbers (1-31)
  table_lines <- grep("^\\d{1,2}\\s", lines, value = TRUE)
  
  # Split on 2+ spaces, assuming table is aligned that way
  split_lines <- str_split_fixed(table_lines, "\\s{2,}", 5)
  
  colnames(split_lines) <- c("Day", "Y2015", "Y2016", "Y2017", "Diff")
  
  # Convert to a tibble and clean up
  as_tibble(split_lines) %>%
    mutate(across(everything(), str_trim)) %>%
    mutate(across(-Day, ~ as.numeric(str_replace_all(., "[^0-9\\.-]", ""))))
}

# Apply the extraction function to each page
tables_list <- lapply(pdf_text_data, extract_table_from_page)

# Combine into one data frame
full_table <- bind_rows(tables_list, .id = "Page")
full_table <- full_table %>% 
  mutate(Month = ifelse(Page == 1, "Sep",
                        ifelse(Page == 2, "Oct",
                               ifelse(Page == 3, "Nov",
                                      ifelse(Page == 4, "Dec",
                                             ifelse(Page == 5, "Jan",
                                                    ifelse(Page == 6, "Feb",
                                                           ifelse(Page == 7, "Mar", 
                                                                  ifelse(Page == 8, "Apr",
                                                                         ifelse(Page == 9, "May",
                                                                                ifelse(Page == 10, "Jun",
                                                                                       ifelse(Page == 11, "Jul", "Aug")))))))))))) %>% 
  dplyr::select(Page, Month, Y2015, Y2016, Y2016, Y2017, Diff) %>% 
  rownames_to_column("index")

# fixing nov data
full_table <- full_table %>% 
  filter(!(index %in% c(84:88)))

# fixing dec data 
full_table[full_table$Page == 4, "Y2017"] <- 0
full_table <- full_table %>% 
  filter(!(index %in% c(115)))

# fixing jan data
full_table <- full_table %>% 
  filter(!(index %in% c(147)))

# fixing feb data
full_table <- full_table %>% 
  filter(!(index %in% c(179)))

full_table[full_table$index == 190, "Y2015"] <- NA
full_table[full_table$index == 190, "Y2016"] <- 70

# fixing march data
full_table <- full_table %>% 
  filter(!(index %in% c(209)))

# fixing april data
full_table <- full_table %>% 
  filter(!(index %in% c(241)))

# fixing may data
full_table <- full_table %>% 
  filter(!(index %in% c(272)))

# fixing june date
full_table <- full_table %>% 
  filter(!(index %in% c(306)))

# fixing july date
full_table <- full_table %>% 
  filter(!(index %in% c(335)))

# fixing aug date
full_table <- full_table %>% 
  filter(!(index %in% c(367)))

# View(full_table[full_table$Page == 12, ]) 

full_table <- full_table %>% 
  mutate(diff = Y2017 - Y2016,
         Page = as.numeric(Page),
         day = c(c(1:30), c(1:31),
         c(1:30), c(1:31), 
         c(1:31), c(1:29), 
         c(1:31), c(1:30), 
         c(1:31), c(1:30), 
         c(1:31), c(1:31))) %>% 
  mutate(Month = factor(Month, levels = c("Jan", "Feb", "Mar", "Apr",
                                          "May", "Jun", "Jul", "Aug",
                                          "Sep", "Oct", "Nov", "Dec")),
         nyt_diff = diff, 
         month = Month) %>% 
  arrange(month) %>% 
  dplyr:: select(month, day, Y2015, Y2016, Y2017, nyt_diff) 


kable(full_table %>% group_by(month) %>% slice(1:3))
month day Y2015 Y2016 Y2017 nyt_diff
Jan 1 107 89 107 18
Jan 2 101 88 108 20
Jan 3 78 79 115 36
Feb 1 66 111 93 -18
Feb 2 114 95 88 -7
Feb 3 95 119 68 -51
Mar 1 82 73 73 0
Mar 2 92 69 89 20
Mar 3 95 67 57 -10
Apr 1 70 79 89 10
Apr 2 85 53 94 41
Apr 3 80 77 81 4
May 1 66 74 62 -12
May 2 87 59 93 34
May 3 77 74 73 -1
Jun 1 68 74 75 1
Jun 2 76 71 75 4
Jun 3 62 70 84 14
Jul 1 71 84 78 -6
Jul 2 67 75 82 7
Jul 3 85 69 80 11
Aug 1 69 65 105 40
Aug 2 72 79 73 -6
Aug 3 74 78 88 10
Sep 1 75 75 92 17
Sep 2 77 67 69 2
Sep 3 67 78 78 0
Oct 1 74 90 101 11
Oct 2 84 67 104 37
Oct 3 77 82 118 36
Nov 1 77 84 78 -6
Nov 2 71 72 90 18
Nov 3 83 74 68 -6
Dec 1 82 77 0 -77
Dec 2 87 88 0 -88
Dec 3 83 67 0 -67
# comparing nyt data w excessmort data
nyt_check_df <- dat %>% 
  mutate(year = year(date), 
         month = month(date, label = TRUE),
         day = day(date)) %>% 
  dplyr::select(day, month, year, outcome) %>% 
  filter(year >= 2015 & year <= 2017) %>% 
  group_by(day, month, year) %>% 
  summarise(total_deaths = sum(outcome)) %>% 
  pivot_wider(names_from = year, values_from = total_deaths) %>% 
  mutate(og_diff = `2017`-`2016`, 
         month = factor(as.character(month), levels = c("Jan", "Feb", "Mar", "Apr",
                                          "May", "Jun", "Jul", "Aug",
                                          "Sep", "Oct", "Nov", "Dec"))) %>%
  arrange(month)
`summarise()` has grouped output by 'day', 'month'. You can override using the
`.groups` argument.
kable(nyt_check_df %>% group_by(month) %>% slice(1:3)) 
day month 2015 2016 2017 og_diff
1 Jan 107 89 107 18
2 Jan 101 88 108 20
3 Jan 78 79 115 36
1 Feb 66 111 93 -18
2 Feb 114 95 88 -7
3 Feb 95 119 68 -51
1 Mar 82 74 73 -1
2 Mar 92 71 89 18
3 Mar 95 67 57 -10
1 Apr 70 79 89 10
2 Apr 85 53 94 41
3 Apr 80 77 81 4
1 May 66 74 62 -12
2 May 87 59 94 35
3 May 77 74 73 -1
1 Jun 68 73 75 2
2 Jun 76 70 75 5
3 Jun 62 70 82 12
1 Jul 71 84 78 -6
2 Jul 67 75 84 9
3 Jul 85 69 82 13
1 Aug 65 69 106 37
2 Aug 79 72 75 3
3 Aug 78 74 89 15
1 Sep 75 75 95 20
2 Sep 77 67 69 2
3 Sep 67 78 80 2
1 Oct 74 90 104 14
2 Oct 85 67 108 41
3 Oct 77 82 122 40
1 Nov 77 84 84 0
2 Nov 71 72 95 23
3 Nov 83 74 72 -2
1 Dec 82 77 91 14
2 Dec 87 88 89 1
3 Dec 83 68 80 12
# calculating differences between the two datasets
final_comparison <- inner_join(nyt_check_df %>% dplyr::select(day, month, og_diff),
                               full_table %>% dplyr::select(day, month, nyt_diff), 
                               by = c("day", "month")) %>% 
  mutate(difference = og_diff - nyt_diff) 

final_comparison$week_index <- c(1:nrow(final_comparison))

mean(final_comparison$difference, na.rm = TRUE)
[1] 10.89863
# plot of differences 

ggplot(final_comparison, aes(x = week_index, y = difference)) +
  geom_line(color = "darkblue") +
  labs(title = "Difference in Calculated Total Deaths Between NYT and excessmort", 
       subtitle = "Difference Between 2016-2017 ", 
       x = "Day Index", 
       y = "Difference") +
  theme_minimal()

In certain months there are big differences between the two data sources when comparing the calculated 2016-2017 total death differences per day. These differences mostly stem from differences in counts in August as well as the NYT data source completely missing 2017 death data.